/*
 * NVMe storage driver for depthcharge
 * Copyright (c) 2015, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

/* Documentation:
 * This driver implements a minimal subset of the NVMe 1.0e specification
 * (nvmexpress.org). It is designed to balance simplicity and performance,
 * so it polls the NVMe Completion Queue (CQ) for phase changes rather than
 * using interrupts. Initialization commands are processed one at a time,
 * so the Admin queue pair only needs a depth of 2.
 * This driver is limited to a single IO queue pair (in addition to the
 * mandatory Admin queue pair). The IO queue depth is configurable, but
 * defaults to shallow values to minimize host memory consumption. The driver
 * supports at most one PRP List per command, limiting the maximum transfer
 * size to 2MB (assuming 4KB memory pages).
 *
 * Operation:
 * At initialization this driver allocates a pool of host memory and overlays
 * the queue pair structures. It also statically allocates a block of memory
 * for a PRP List, avoiding the need to allocate/free memory at IO time.
 * Each identified NVMe namespace has a corresponding depthcharge BlockDev
 * structure, effectively creating a new "drive" visible to higher levels.
 *
 * The depthcharge read/write callbacks split host requests into chunks that
 * satisfy the NVMe device's maximum transfer size limitation. They then call
 * the corresponding _internal_ functions, which format the NVMe command
 * structures in host memory. After all of the commands have been created,
 * the Submission Queue tail pointer is updated, allowing the drive to
 * process the newly submitted commands. Queuing multiple commands lets the
 * drive internally optimize accesses, increasing performance. Finally, the
 * Completion Queue phase bit is polled until it inverts, indicating that the
 * commands have completed. If the SQ is full, outstanding commands are
 * completed before the _internal_ function proceeds. This reduces effective
 * performance and should be avoided by increasing SQ/CQ depth.
 */
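
/* Example (hypothetical board hook-up; the PCI address is illustrative):
 *
 *   NvmeCtrlr *nvme = new_nvme_ctrlr(PCI_DEV(0, 0x1b, 0));
 *
 * new_nvme_ctrlr() only registers the init and shutdown callbacks; the
 * controller is brought up lazily when depthcharge next refreshes its block
 * device list.
 */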

#include <assert.h>
#include <endian.h>
#include <libpayload.h>
#include <stdint.h>
#include <stdio.h>

#include "base/cleanup_funcs.h"
#include "drivers/storage/blockdev.h"
#include "drivers/storage/nvme.h"

/* Read 64bits from register space */
static uint64_t readll(uintptr_t _a)
{
	uint64_t _v;
	uint32_t *v = (uint32_t *)&_v;

	v[0] = readl(_a);
	v[1] = readl(_a + sizeof(uint32_t));
	return le64toh(_v);
}

/* Write 64bits to register space */
static void writell(uint64_t _v, uintptr_t _a)
{
	uint32_t *v = (uint32_t *)&_v;

	_v = htole64(_v);
	writel(v[0], _a);
	writel(v[1], _a + sizeof(uint32_t));
}
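
/* Note: the NVMe spec permits 64-bit registers such as ASQ/ACQ to be
 * accessed as two aligned 32-bit accesses, which keeps readll()/writell()
 * usable on 32-bit hosts. Assembling the halves through a uint32_t view
 * assumes a little-endian host, which holds for depthcharge targets.
 */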

DEBUG(
static void nvme_dump_status(NVME_CQ volatile *cq) {
	printf("Dump NVMe Completion Entry Status from [%p]:\n", cq);

	printf("  SQ ID : [0x%x], Phase Tag : [%d], Cmd ID : [0x%x] Flags : [0x%x]\n",
			cq->sqid, cq->flags & NVME_CQ_FLAGS_PHASE, cq->cid, cq->flags);

	if (NVME_CQ_FLAGS_SCT(cq->flags) == 0) {
		if (NVME_CQ_FLAGS_SC(cq->flags) == 0)
			printf("  NVMe Cmd Execution Result - Successful\n");
		else
			printf("  NVMe Cmd Execution Result - error sc=%u\n",
				NVME_CQ_FLAGS_SC(cq->flags));
	} else
		printf("  NVMe Cmd Execution Result - error sct=%u\n",
			NVME_CQ_FLAGS_SCT(cq->flags));
}
) //DEBUG

/* Disables and resets the NVMe controller */
static NVME_STATUS nvme_disable_controller(NvmeCtrlr *ctrlr) {
	NVME_CC cc;
	uint32_t timeout;

	/* Read controller configuration */
	cc = readl(ctrlr->ctrlr_regs + NVME_CC_OFFSET);
	CLR(cc, NVME_CC_EN);
	/* Write controller configuration */
	writel_with_flush(cc, ctrlr->ctrlr_regs + NVME_CC_OFFSET);
	/* Delay up to CAP.TO ms for CSTS.RDY to clear */
	if (NVME_CAP_TO(ctrlr->cap) == 0)
		timeout = 1;
	else
		timeout = NVME_CAP_TO(ctrlr->cap);
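	/* Note: CAP.TO is expressed in 500 ms units by the spec; NVME_CAP_TO()
	 * is assumed to convert it to the milliseconds that WAIT_WHILE()
	 * expects. */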

	if (WAIT_WHILE(
		((readl(ctrlr->ctrlr_regs + NVME_CSTS_OFFSET) & NVME_CSTS_RDY) == 1),
		timeout)) {
		return NVME_TIMEOUT;
	}

	return NVME_SUCCESS;
}

/* Enables controller and verifies that it's ready */
static NVME_STATUS nvme_enable_controller(NvmeCtrlr *ctrlr) {
	NVME_CC cc = 0;
	uint32_t timeout;

	SET(cc, NVME_CC_EN);
	cc |= NVME_CC_IOSQES(6); /* Spec. recommended values */
	cc |= NVME_CC_IOCQES(4); /* Spec. recommended values */
	/* Write controller configuration. */
	writel_with_flush(cc, ctrlr->ctrlr_regs + NVME_CC_OFFSET);

	/* Delay up to CAP.TO ms for CSTS.RDY to set */
	if (NVME_CAP_TO(ctrlr->cap) == 0)
		timeout = 1;
	else
		timeout = NVME_CAP_TO(ctrlr->cap);

	if (WAIT_WHILE(
		((readl(ctrlr->ctrlr_regs + NVME_CSTS_OFFSET) & NVME_CSTS_RDY) == 0),
		timeout)) {
		return NVME_TIMEOUT;
	}

	return NVME_SUCCESS;
}

/* Add command to host SQ, don't write to HW SQ yet
 *
 * ctrlr: NVMe controller handle
 * qid: Queue Identifier for the SQ/CQ containing the new command
 * sqsize: Size of the submission queue
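 *
 * Example: with sqsize == 16 the tail index advances 0, 1, ..., 15 and then
 * wraps back to 0. The hardware doorbell write is deferred to
 * nvme_ring_sq_doorbell().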
 */
static NVME_STATUS nvme_submit_cmd(NvmeCtrlr *ctrlr, uint16_t qid, uint32_t sqsize) {
	if (NULL == ctrlr)
		return NVME_INVALID_PARAMETER;
	if (qid > NVME_NUM_IO_QUEUES)
		return NVME_INVALID_PARAMETER;

	/* Update the submission queue tail in host memory */
	if (++(ctrlr->sq_t_dbl[qid]) > (sqsize-1))
		ctrlr->sq_t_dbl[qid] = 0;

	return NVME_SUCCESS;
}

/* Ring SQ doorbell register, submitting all outstanding commands to HW
 *
 * ctrlr: NVMe controller handle
 * qid: Queue Identifier for the SQ/CQ containing the new command
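 *
 * Per the NVMe spec, the tail doorbell for SQ y lives at register offset
 * 0x1000 + (2y) * (4 << CAP.DSTRD); NVME_SQTDBL_OFFSET() is expected to
 * compute exactly that.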
 */
static NVME_STATUS nvme_ring_sq_doorbell(NvmeCtrlr *ctrlr, uint16_t qid) {
	if (NULL == ctrlr)
		return NVME_INVALID_PARAMETER;
	if (qid > NVME_NUM_IO_QUEUES)
		return NVME_INVALID_PARAMETER;

	/* Ring SQ doorbell by writing SQ tail index to controller */
	writel_with_flush(ctrlr->sq_t_dbl[qid],
				ctrlr->ctrlr_regs +
				NVME_SQTDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap)));

	return NVME_SUCCESS;
}

/* Poll for completion of all commands from HW
 *
 * ctrlr: NVMe controller handle
 * qid: Queue Identifier for the SQ/CQ containing the new command
 * cqsize: Size of the completion queue
 * timeout_ms: How long in milliseconds to wait for command completion
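 *
 * Phase-tag mechanics: pt[qid] starts at 0 and the controller posts new
 * completions with the phase bit inverted relative to the previous pass, so
 * a flipped bit marks a fresh entry. Each time the CQ head wraps, the
 * expected phase toggles (see below), preventing stale entries from being
 * mistaken for new ones.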
 */
static NVME_STATUS nvme_complete_cmds_polled(NvmeCtrlr *ctrlr,
			uint16_t qid,
			uint32_t cqsize,
			uint32_t timeout_ms) {
	NVME_CQ *cq;
	uint32_t ncmds;

	if (NULL == ctrlr)
		return NVME_INVALID_PARAMETER;
	if (qid > NVME_NUM_IO_QUEUES)
		return NVME_INVALID_PARAMETER;
	if (timeout_ms == 0)
		timeout_ms = 1;

	/* We will complete all outstanding commands */
	if (ctrlr->cq_h_dbl[qid] < ctrlr->sq_t_dbl[qid])
		ncmds = ctrlr->sq_t_dbl[qid] - ctrlr->cq_h_dbl[qid];
	else
		ncmds = (cqsize - ctrlr->cq_h_dbl[qid]) + ctrlr->sq_t_dbl[qid];
	DEBUG(printf("nvme_complete_cmds_polled: completing %u commands\n",(unsigned)ncmds);)

	while (ncmds--) {
		cq  = ctrlr->cq_buffer[qid] + ctrlr->cq_h_dbl[qid];
		/* Wait for phase to change (or timeout) */
		if (WAIT_WHILE(
			((readw(&(cq->flags)) & NVME_CQ_FLAGS_PHASE) == ctrlr->pt[qid]),
			timeout_ms)) {
				printf("nvme_complete_cmds_polled: ERROR - timeout\n");
				return NVME_TIMEOUT;
		}

		/* Dump completion entry status for debugging. */
		DEBUG(nvme_dump_status(cq);)

		/* Update the doorbell, queue phase, and queue command id if necessary */
		if (++(ctrlr->cq_h_dbl[qid]) > (cqsize-1)) {
			ctrlr->cq_h_dbl[qid] = 0;
			ctrlr->pt[qid] ^= 1;
		}
		/* Update SQ head pointer */
		ctrlr->sqhd[qid] = cq->sqhd;
	}

	/* Ring the completion queue doorbell register */
	writel_with_flush(ctrlr->cq_h_dbl[qid],
				ctrlr->ctrlr_regs +
				NVME_CQHDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap)));

	/* If the SQ is empty, reset cid to zero */
	if (ctrlr->sq_t_dbl[qid] == ctrlr->sqhd[qid])
		ctrlr->cid[qid] = 0;

	return NVME_SUCCESS;
}

/* Submit and complete 1 command by polling CQ for phase change
 * Rings SQ doorbell, polls waiting for completion, rings CQ doorbell
 *
 * ctrlr: NVMe controller handle
 * qid: Queue Identifier for the SQ/CQ containing the new command
 * sqsize: Number of commands (size) of the submission queue
 * cqsize: Number of commands (size) of the completion queue
 * timeout_ms: How long in milliseconds to wait for command completion
 */
static NVME_STATUS nvme_do_one_cmd_synchronous(NvmeCtrlr *ctrlr,
			uint16_t qid,
			uint32_t sqsize,
			uint32_t cqsize,
			uint32_t timeout_ms) {
	NVME_STATUS status = NVME_SUCCESS;

	if (NULL == ctrlr)
		return NVME_INVALID_PARAMETER;
	if (qid > NVME_NUM_IO_QUEUES)
		return NVME_INVALID_PARAMETER;
	if (timeout_ms == 0)
		timeout_ms = 1;

	/* This function should only be called when no commands are pending
	 * because it will complete all outstanding commands. */
	if (ctrlr->sq_t_dbl[qid] != ctrlr->sqhd[qid])
		printf("nvme_do_one_cmd_synchronous: warning, SQ not empty. All commands will be completed.\n");

	status = nvme_submit_cmd(ctrlr, qid, sqsize);
	if (NVME_ERROR(status)) {
		DEBUG(printf("nvme_do_one_cmd_synchronous: error %d submitting command\n",status);)
		return status;
	}

	status = nvme_ring_sq_doorbell(ctrlr, qid);
	if (NVME_ERROR(status)) {
		DEBUG(printf("nvme_do_one_cmd_synchronous: error %d ringing doorbell\n",status);)
		return status;
	}

	status = nvme_complete_cmds_polled(ctrlr, qid, cqsize, NVME_GENERIC_TIMEOUT);
	if (NVME_ERROR(status)) {
		DEBUG(printf("nvme_do_one_cmd_synchronous: error %d completing command\n",status);)
		return status;
	}

	return NVME_SUCCESS;
}

/* Sends Set Features (Feature ID 07h, Number of Queues) to allocate count IO queue pairs */
static NVME_STATUS nvme_set_queue_count(NvmeCtrlr *ctrlr, uint16_t count) {
	NVME_SQ *sq;
	int status = NVME_SUCCESS;

	if (count == 0)
		return NVME_INVALID_PARAMETER;

	sq  = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];

	memset(sq, 0, sizeof(NVME_SQ));

	sq->opc = NVME_ADMIN_SETFEATURES_OPC;
	sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;

	sq->cdw10 = NVME_ADMIN_SETFEATURES_NUMQUEUES;

	/* Count is a 0's based value, so subtract one */
	count--;
	/* Set count number of IO SQs and CQs */
	sq->cdw11 = count;
	sq->cdw11 |= (count << 16);
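	/* Per the spec, Number of Queues (Feature 07h) takes NSQR in CDW11
	 * bits 15:0 and NCQR in bits 31:16, both zero-based, hence the
	 * shift above. */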

	status = nvme_do_one_cmd_synchronous(ctrlr,
				NVME_ADMIN_QUEUE_INDEX,
				NVME_ASQ_SIZE,
				NVME_ACQ_SIZE,
				NVME_GENERIC_TIMEOUT);

	return status;
}

/* Creates a single IO completion queue */
static NVME_STATUS nvme_create_cq(NvmeCtrlr *ctrlr, uint16_t qid, uint16_t qsize) {
	NVME_SQ *sq;
	int status = NVME_SUCCESS;

	sq  = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];

	memset(sq, 0, sizeof(NVME_SQ));

	sq->opc = NVME_ADMIN_CRIOCQ_OPC;
	sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;

	/* Only physically contiguous address supported */
	sq->prp[0] = (uintptr_t)virt_to_phys(ctrlr->cq_buffer[qid]);
	/* Set physically contiguous (PC) bit */
	sq->cdw11 = 1;

	sq->cdw10 |= NVME_ADMIN_CRIOCQ_QID(qid);
	sq->cdw10 |= NVME_ADMIN_CRIOCQ_QSIZE(qsize);

	status = nvme_do_one_cmd_synchronous(ctrlr,
				NVME_ADMIN_QUEUE_INDEX,
				NVME_ASQ_SIZE,
				NVME_ACQ_SIZE,
				NVME_GENERIC_TIMEOUT);

	return status;
}

/* Creates a single IO submission queue
 * NOTE: Assumes that completion queue ID == submission queue ID
 */
static NVME_STATUS nvme_create_sq(NvmeCtrlr *ctrlr, uint16_t qid, uint16_t qsize) {
	NVME_SQ *sq;
	int status = NVME_SUCCESS;

	sq  = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];

	memset(sq, 0, sizeof(NVME_SQ));

	sq->opc = NVME_ADMIN_CRIOSQ_OPC;
	sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;

	/* Only physically contiguous address supported */
	sq->prp[0] = (uintptr_t)virt_to_phys(ctrlr->sq_buffer[qid]);
	/* Set physically contiguous (PC) bit */
	sq->cdw11 = 1;
	sq->cdw11 |= NVME_ADMIN_CRIOSQ_CQID(qid);

	sq->cdw10 |= NVME_ADMIN_CRIOSQ_QID(qid);
	sq->cdw10 |= NVME_ADMIN_CRIOSQ_QSIZE(qsize);

	status = nvme_do_one_cmd_synchronous(ctrlr,
				NVME_ADMIN_QUEUE_INDEX,
				NVME_ASQ_SIZE,
				NVME_ACQ_SIZE,
				NVME_GENERIC_TIMEOUT);

	return status;
}

/* Generate PRPs for a single virtual memory buffer
 * prp_list: pre-allocated prp list buffer
 * prp: pointer to SQ PRP array
 * buffer: host buffer for request
 * size: number of bytes in request
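 *
 * Worked example: a 12KB transfer from physical address 0x10200 has
 * offset = 0x200 and touches four pages, so prp[0] = 0x10200 and prp[1]
 * points at a PRP List holding 0x11000, 0x12000 and 0x13000.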
 */
static NVME_STATUS nvme_fill_prp(PrpList *prp_list, uint64_t *prp, void *buffer, uint64_t size)
{
	uint64_t offset = (uintptr_t)buffer & (NVME_PAGE_SIZE - 1);
	uint64_t xfer_pages;
	uintptr_t buffer_phys = virt_to_phys(buffer);

	/* PRP0 is always the (potentially unaligned) start of the buffer */
	prp[0] = buffer_phys;
	/* Advance buffer_phys to the start of the next page */
	if (ALIGN(buffer_phys, NVME_PAGE_SIZE) == buffer_phys)
		buffer_phys += NVME_PAGE_SIZE;
	else
		buffer_phys = ALIGN_UP(buffer_phys, NVME_PAGE_SIZE);

	/* Case 1: all data will fit in 2 PRP entries (accounting for buffer offset) */
	if ((size + offset) <= (2 * NVME_PAGE_SIZE)) {
		prp[1] = buffer_phys;
		return NVME_SUCCESS;
	}

	/* Case 2: Need to build up to one PRP List */
	xfer_pages = (ALIGN((size + offset), NVME_PAGE_SIZE) >> NVME_PAGE_SHIFT);
	/* Don't count first prp entry as it is the beginning of buffer */
	xfer_pages--;
	/* Make sure this transfer fits into one PRP list */
	if (xfer_pages > (NVME_MAX_XFER_BYTES/NVME_PAGE_SIZE))
		return NVME_INVALID_PARAMETER;

	/* Fill the PRP List */
	prp[1] = (uintptr_t)virt_to_phys(prp_list);
	for (uint32_t entry_index = 0; entry_index < xfer_pages; entry_index++) {
		prp_list->prp_entry[entry_index] = buffer_phys;
		buffer_phys += NVME_PAGE_SIZE;
	}
	return NVME_SUCCESS;
}

/* Sets up read operation for up to max_transfer blocks */
static NVME_STATUS nvme_internal_read(NvmeDrive *drive, void *buffer, lba_t start, lba_t count)
{
	NvmeCtrlr *ctrlr = drive->ctrlr;
	NVME_SQ *sq;
	int status = NVME_SUCCESS;

	if (count == 0)
		return NVME_INVALID_PARAMETER;

	/* If queue is full, need to complete inflight commands before submitting more */
	if ((ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX] + 1) % ctrlr->iosq_sz == ctrlr->sqhd[NVME_IO_QUEUE_INDEX]) {
		DEBUG(printf("nvme_internal_read: Too many outstanding commands. Completing in-flights\n");)
		/* Submit commands to controller */
		nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
		/* Complete submitted command(s) */
		status = nvme_complete_cmds_polled(ctrlr,
				NVME_IO_QUEUE_INDEX,
				NVME_CCQ_SIZE,
				NVME_GENERIC_TIMEOUT);
		if (NVME_ERROR(status)) {
			printf("nvme_internal_read: error %d completing outstanding commands\n",status);
			return status;
		}
	}

	sq  = ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX];

	memset(sq, 0, sizeof(NVME_SQ));

	sq->opc = NVME_IO_READ_OPC;
	sq->cid = ctrlr->cid[NVME_IO_QUEUE_INDEX]++;
	sq->nsid = drive->namespace_id;

	status = nvme_fill_prp(ctrlr->prp_list[sq->cid], sq->prp, buffer, count * drive->dev.block_size);
	if (NVME_ERROR(status)) {
		printf("nvme_internal_read: error %d generating PRP(s)\n",status);
		return status;
	}

	sq->cdw10 = start;
	sq->cdw11 = (start >> 32);
	sq->cdw12 = (count - 1) & 0xFFFF;

	status = nvme_submit_cmd(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iosq_sz);

	return status;
}

/* Read operation entrypoint
 * Cut operation into max_transfer chunks and do it
 */
static lba_t nvme_read(BlockDevOps *me, lba_t start, lba_t count, void *buffer)
{
	NvmeDrive *drive = container_of(me, NvmeDrive, dev.ops);
	NvmeCtrlr *ctrlr = drive->ctrlr;
	uint64_t max_transfer_blocks = 0;
	uint32_t block_size = drive->dev.block_size;
	lba_t orig_count = count;
	int status = NVME_SUCCESS;

	DEBUG(printf("nvme_read: Reading from namespace %d\n",drive->namespace_id);)

	if (ctrlr->controller_data->mdts != 0)
		max_transfer_blocks = ((1 << ctrlr->controller_data->mdts) *
			(1 << NVME_CAP_MPSMIN(ctrlr->cap))) / block_size;
	/* Artificially limit max_transfer_blocks to 1 PRP List */
	if ((max_transfer_blocks == 0) ||
			(max_transfer_blocks > (NVME_MAX_XFER_BYTES / block_size)))
		max_transfer_blocks = NVME_MAX_XFER_BYTES / block_size;
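	/* Example: assuming NVME_CAP_MPSMIN() yields the minimum page shift
	 * (12 for 4KB pages), MDTS = 5 permits 2^5 * 4KB = 128KB per command,
	 * i.e. 256 blocks of 512 bytes. */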

	while (count > 0) {
		if (count > max_transfer_blocks) {
			DEBUG(printf("nvme_read: partial read of %llu blocks\n",(unsigned long long)max_transfer_blocks);)
			status = nvme_internal_read(drive, buffer, start, max_transfer_blocks);
			count -= max_transfer_blocks;
			buffer += max_transfer_blocks*block_size;
			start += max_transfer_blocks;
		} else {
			DEBUG(printf("nvme_read: final read of %llu blocks\n",(unsigned long long)count);)
			status = nvme_internal_read(drive, buffer, start, count);
			count = 0;
		}
		if (NVME_ERROR(status))
			break;
	}

	if (NVME_ERROR(status)) {
		printf("nvme_read: error %d\n",status);
		return -1;
	}

	/* Submit commands to controller */
	nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
	/* Complete submitted command(s) */
	nvme_complete_cmds_polled(ctrlr,
			NVME_IO_QUEUE_INDEX,
			NVME_CCQ_SIZE,
			NVME_GENERIC_TIMEOUT);

	DEBUG(printf("nvme_read: lba = 0x%08x, Original = 0x%08x, Remaining = 0x%08x, BlockSize = 0x%x Status = %d\n", (uint32_t)start, (uint32_t)orig_count, (uint32_t)count, block_size, status);)

	return orig_count - count;
}

/* Sets up write operation for up to max_transfer blocks */
static NVME_STATUS nvme_internal_write(NvmeDrive *drive, void *buffer, lba_t start, lba_t count)
{
	NvmeCtrlr *ctrlr = drive->ctrlr;
	NVME_SQ *sq;
	int status = NVME_SUCCESS;

	if (count == 0)
		return NVME_INVALID_PARAMETER;

	/* If queue is full, need to complete inflight commands before submitting more */
	if ((ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX] + 1) % ctrlr->iosq_sz == ctrlr->sqhd[NVME_IO_QUEUE_INDEX]) {
		DEBUG(printf("nvme_internal_write: Too many outstanding commands. Completing in-flights\n");)
		/* Submit commands to controller */
		nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
		/* Complete submitted command(s) */
		status = nvme_complete_cmds_polled(ctrlr,
				NVME_IO_QUEUE_INDEX,
				NVME_CCQ_SIZE,
				NVME_GENERIC_TIMEOUT);
		if (NVME_ERROR(status)) {
			printf("nvme_internal_read: error %d completing outstanding commands\n",status);
			return status;
		}
	}

	sq  = ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX];

	memset(sq, 0, sizeof(NVME_SQ));

	sq->opc = NVME_IO_WRITE_OPC;
	sq->cid = ctrlr->cid[NVME_IO_QUEUE_INDEX]++;
	sq->nsid = drive->namespace_id;

	status = nvme_fill_prp(ctrlr->prp_list[sq->cid], sq->prp, buffer, count * drive->dev.block_size);
	if (NVME_ERROR(status)) {
		printf("nvme_internal_write: error %d generating PRP(s)\n",status);
		return status;
	}

	sq->cdw10 = start;
	sq->cdw11 = (start >> 32);
	sq->cdw12 = (count - 1) & 0xFFFF;

	status = nvme_submit_cmd(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iosq_sz);

	return status;
}

/* Write operation entrypoint
 * Cut operation into max_transfer chunks and do it
 */
static lba_t nvme_write(BlockDevOps *me, lba_t start, lba_t count,
						const void *buffer)
{
	NvmeDrive *drive = container_of(me, NvmeDrive, dev.ops);
	NvmeCtrlr *ctrlr = drive->ctrlr;
	uint64_t max_transfer_blocks = 0;
	uint32_t block_size = drive->dev.block_size;
	lba_t orig_count = count;
	int status = NVME_SUCCESS;

	DEBUG(printf("nvme_write: Writing to namespace %d\n",drive->namespace_id);)

	if (ctrlr->controller_data->mdts != 0)
		max_transfer_blocks = ((1 << ctrlr->controller_data->mdts) *
			(1 << NVME_CAP_MPSMIN(ctrlr->cap))) / block_size;
	/* Artificially limit max_transfer_blocks to 1 PRP List */
	if ((max_transfer_blocks == 0) ||
			(max_transfer_blocks > (NVME_MAX_XFER_BYTES / block_size)))
		max_transfer_blocks = NVME_MAX_XFER_BYTES / block_size;

	while (count > 0) {
		if (count > max_transfer_blocks) {
			DEBUG(printf("nvme_write: partial write of %llu blocks\n",(unsigned long long)max_transfer_blocks);)
			status = nvme_internal_write(drive, (void *)buffer, start, max_transfer_blocks);
			count -= max_transfer_blocks;
			buffer += max_transfer_blocks*block_size;
			start += max_transfer_blocks;
		} else {
			DEBUG(printf("nvme_write final write of %llu blocks\n",(unsigned long long)count);)
			status = nvme_internal_write(drive, (void *)buffer, start, count);
			count = 0;
		}
		if (NVME_ERROR(status))
			break;
	}

	if (NVME_ERROR(status)) {
		printf("nvme_write: error %d\n",status);
		return -1;
	}

	/* Submit commands to controller */
	nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
	/* Complete submitted command(s) */
	nvme_complete_cmds_polled(ctrlr,
			NVME_IO_QUEUE_INDEX,
			NVME_CCQ_SIZE,
			NVME_GENERIC_TIMEOUT);

	DEBUG(printf("nvme_write: lba = 0x%08x, Original = 0x%08x, Remaining = 0x%08x, BlockSize = 0x%x Status = %d\n", (uint32_t)start, (uint32_t)orig_count, (uint32_t)count, block_size, status);)

	return orig_count - count;
}

/* Sends the Identify command, saves result in ctrlr->controller_data */
static NVME_STATUS nvme_identify(NvmeCtrlr *ctrlr) {
	NVME_SQ *sq;
	int status = NVME_SUCCESS;

	ctrlr->controller_data = dma_memalign(NVME_PAGE_SIZE, sizeof(NVME_ADMIN_CONTROLLER_DATA));
	if (ctrlr->controller_data == NULL) {
		printf("nvme_identify: ERROR - out of memory\n");
		return NVME_OUT_OF_RESOURCES;
	}

	sq  = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];

	memset(sq, 0, sizeof(NVME_SQ));

	sq->opc = NVME_ADMIN_IDENTIFY_OPC;
	sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;

	/* The Identify data structure is 4KB in size and fits in one aligned page */
	sq->prp[0] = (uintptr_t)virt_to_phys(ctrlr->controller_data);
	/* Set CNS (cdw10 bit 0) to 1 to identify the controller */
	sq->cdw10 = 1;

	status = nvme_do_one_cmd_synchronous(ctrlr,
				NVME_ADMIN_QUEUE_INDEX,
				NVME_ASQ_SIZE,
				NVME_ACQ_SIZE,
				NVME_GENERIC_TIMEOUT);
	if (NVME_ERROR(status))
		return status;

	/* NUL-terminate the serial and model number strings for printing */
	ctrlr->controller_data->sn[19] = 0;
	ctrlr->controller_data->mn[39] = 0;
	DEBUG(printf(" == NVME IDENTIFY CONTROLLER DATA ==\n");)
	DEBUG(printf("    PCI VID   : 0x%x\n", ctrlr->controller_data->vid);)
	DEBUG(printf("    PCI SSVID : 0x%x\n", ctrlr->controller_data->ssvid);)
	DEBUG(printf("    SN        : %s\n",   (char *)(ctrlr->controller_data->sn));)
	DEBUG(printf("    MN        : %s\n",   (char *)(ctrlr->controller_data->mn));)
	DEBUG(printf("    RAB       : 0x%x\n", ctrlr->controller_data->rab);)
	DEBUG(printf("    AERL      : 0x%x\n", ctrlr->controller_data->aerl);)
	DEBUG(printf("    SQES      : 0x%x\n", ctrlr->controller_data->sqes);)
	DEBUG(printf("    CQES      : 0x%x\n", ctrlr->controller_data->cqes);)
	DEBUG(printf("    NN        : 0x%x\n", ctrlr->controller_data->nn);)

	return status;
}

/* Sends the Identify Namespace command, creates NvmeDrives for each namespace */
static NVME_STATUS nvme_identify_namespaces(NvmeCtrlr *ctrlr) {
	NVME_SQ *sq;
	NVME_ADMIN_NAMESPACE_DATA *namespace_data = NULL;
	int status = NVME_SUCCESS;

	if (ctrlr->controller_data == NULL) {
		printf("nvme_identify_namespaces: ERROR - must complete Identify command first\n");
		return NVME_INVALID_PARAMETER;
	}

	namespace_data = dma_memalign(NVME_PAGE_SIZE, sizeof(NVME_ADMIN_NAMESPACE_DATA));
	if (namespace_data == NULL) {
		printf("nvme_identify_namespaces: ERROR - out of memory\n");
		return NVME_OUT_OF_RESOURCES;
	}

	for (uint32_t index = 1; index <= ctrlr->controller_data->nn; index++) {
		DEBUG(printf("nvme_identify_namespaces: Working on namespace %d\n",index);)

		sq  = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];

		memset(sq, 0, sizeof(NVME_SQ));

		sq->opc = NVME_ADMIN_IDENTIFY_OPC;
		sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;
		sq->nsid = index;

		/* The Identify data structure is 4KB in size and fits in one aligned page */
		sq->prp[0] = (uintptr_t)virt_to_phys(namespace_data);
		/* CNS (cdw10 bit 0) is left 0 by the memset above, selecting Identify Namespace */

		status = nvme_do_one_cmd_synchronous(ctrlr,
				NVME_ADMIN_QUEUE_INDEX,
				NVME_ASQ_SIZE,
				NVME_ACQ_SIZE,
				NVME_GENERIC_TIMEOUT);
		if (NVME_ERROR(status))
			goto exit;

		DEBUG(printf(" == NVME IDENTIFY NAMESPACE [%d] DATA ==\n", index);)
		DEBUG(printf("    NSZE        : 0x%llx\n", namespace_data->nsze);)
		DEBUG(printf("    NCAP        : 0x%llx\n", namespace_data->ncap);)
		DEBUG(printf("    NUSE        : 0x%llx\n", namespace_data->nuse);)
		DEBUG(printf("    LBAF0.LBADS : 0x%x\n", (namespace_data->lba_format[0].lbads));)

		if (namespace_data->ncap == 0) {
			printf("nvme_identify_namespaces: ERROR - namespace %d has zero capacity\n", index);
			status = NVME_DEVICE_ERROR;
			goto exit;
		} else {
			/* Create drive node. */
			NvmeDrive *nvme_drive = xzalloc(sizeof(*nvme_drive));
			static const int name_size = 21;
			char *name = xmalloc(name_size);
			snprintf(name, name_size, "NVMe Namespace %d", index);
			nvme_drive->dev.ops.read = &nvme_read;
			nvme_drive->dev.ops.write = &nvme_write;
			nvme_drive->dev.ops.new_stream = &new_simple_stream;
			nvme_drive->dev.name = name;
			nvme_drive->dev.removable = 0;
			/* LBADS is log2 of the LBA data size, e.g. LBADS = 9 means 512-byte blocks */
			nvme_drive->dev.block_size =
				1 << namespace_data->lba_format[namespace_data->flbas & 0xF].lbads;
			nvme_drive->dev.block_count = namespace_data->nsze;
			nvme_drive->ctrlr = ctrlr;
			nvme_drive->namespace_id = index;
			list_insert_after(&nvme_drive->dev.list_node,
								&fixed_block_devices);
			list_insert_after(&nvme_drive->list_node, &ctrlr->drives);
			printf("Added NVMe drive \"%s\" lbasize:%d, count:0x%llx\n", nvme_drive->dev.name, nvme_drive->dev.block_size, (uint64_t)nvme_drive->dev.block_count);
		}
	}

exit:
	if (namespace_data != NULL)
		free(namespace_data);

	return status;
}

/* Initialization entrypoint */
static int nvme_ctrlr_init(BlockDevCtrlrOps *me)
{
	NvmeCtrlr *ctrlr = container_of(me, NvmeCtrlr, ctrlr.ops);
	pcidev_t dev = ctrlr->dev;
	int status = NVME_SUCCESS;

	if ((pci_read_config8(ctrlr->dev, REG_PROG_IF) != PCI_IF_NVMHCI)
		|| (pci_read_config8(ctrlr->dev, REG_SUBCLASS) != PCI_CLASS_MASS_STORAGE_NVM)
		|| (pci_read_config8(ctrlr->dev, REG_CLASS) != PCI_CLASS_MASS_STORAGE)) {
		printf("Unsupported NVMe controller found\n");
		status = NVME_UNSUPPORTED;
		goto exit;
	}

	printf("Initializing NVMe controller %04x:%04x\n",
		pci_read_config16(ctrlr->dev, REG_VENDOR_ID),
		pci_read_config16(ctrlr->dev, REG_DEVICE_ID));

	pci_set_bus_master(dev);

	/* Map controller registers from BAR0, masking off the BAR flag bits */
	ctrlr->ctrlr_regs = pci_read_resource(dev, 0);
	ctrlr->ctrlr_regs &= ~0x7;
	/* Read the Controller Capabilities register */
	ctrlr->cap = readll(ctrlr->ctrlr_regs + NVME_CAP_OFFSET);

	/* Verify that the NVM command set is supported */
	if (NVME_CAP_CSS(ctrlr->cap) != NVME_CAP_CSS_NVM) {
		printf("NVMe Cap CSS not NVMe (CSS=%01x. Unsupported controller.\n",(uint8_t)NVME_CAP_CSS(ctrlr->cap));
		status = NVME_UNSUPPORTED;
		goto exit;
	}

	/* Driver only supports 4k page size */
	if (NVME_CAP_MPSMIN(ctrlr->cap) > NVME_PAGE_SHIFT) {
		printf("NVMe driver only supports 4k page size. Unsupported controller.\n");
		status = NVME_UNSUPPORTED;
		goto exit;
	}

	/* Calculate max io sq/cq sizes based on MQES */
	ctrlr->iosq_sz = MIN(NVME_CSQ_SIZE, NVME_CAP_MQES(ctrlr->cap));
	ctrlr->iocq_sz = MIN(NVME_CCQ_SIZE, NVME_CAP_MQES(ctrlr->cap));
	DEBUG(printf("iosq_sz = %u, iocq_sz = %u\n", ctrlr->iosq_sz, ctrlr->iocq_sz);)

	/* Allocate enough PRP List memory for max queue depth commands */
	for (unsigned int list_index = 0; list_index < ctrlr->iosq_sz; list_index++) {
		ctrlr->prp_list[list_index] = dma_memalign(NVME_PAGE_SIZE, NVME_PAGE_SIZE);
		if (!(ctrlr->prp_list[list_index])) {
			printf("NVMe driver failed to allocate prp list %u memory\n",list_index);
			status = NVME_OUT_OF_RESOURCES;
			goto exit;
		}
		memset(ctrlr->prp_list[list_index], 0, NVME_PAGE_SIZE);
	}

	/* Allocate queue memory block */
	ctrlr->buffer = dma_memalign(NVME_PAGE_SIZE, (NVME_NUM_QUEUES * 2) * NVME_PAGE_SIZE);
	if (!(ctrlr->buffer)) {
		printf("NVMe driver failed to allocate queue buffer\n");
		status = NVME_OUT_OF_RESOURCES;
		goto exit;
	}
	memset(ctrlr->buffer, 0, (NVME_NUM_QUEUES * 2) * NVME_PAGE_SIZE);

	/* Disable controller */
	status = nvme_disable_controller(ctrlr);
	if (NVME_ERROR(status))
		goto exit;

	/* Create Admin queue pair */
	NVME_AQA aqa = 0;
	NVME_ASQ asq = 0;
	NVME_ACQ acq = 0;

	/* Verify defined queue sizes are within NVME_PAGE_SIZE limits */
	#if NVME_ASQ_SIZE != 2
	#error "Unsupported Admin SQ size defined"
	#endif
	#if NVME_ACQ_SIZE != 2
	#error "Unsupported Admin CQ size defined"
	#endif
	#if (NVME_CSQ_SIZE < 2) || (NVME_CSQ_SIZE > (NVME_PAGE_SIZE / 64))
	#error "Unsupported IO SQ size defined"
	#endif
	#if (NVME_CCQ_SIZE < 2) || (NVME_CCQ_SIZE > (NVME_PAGE_SIZE / 64))
	#error "Unsupported IO CQ size defined"
	#endif

	/* Set the number of entries for the Admin submission & completion queues. */
	aqa |= NVME_AQA_ASQS(NVME_ASQ_SIZE);
	aqa |= NVME_AQA_ACQS(NVME_ACQ_SIZE);
	/* Address of Admin submission queue. */
	asq  = (uintptr_t)virt_to_phys(ctrlr->buffer);
	ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] = (NVME_SQ *)ctrlr->buffer;
	/* Address of Admin completion queue. */
	acq  = (uintptr_t)virt_to_phys(ctrlr->buffer + NVME_PAGE_SIZE);
	ctrlr->cq_buffer[NVME_ADMIN_QUEUE_INDEX] = (NVME_CQ *)(ctrlr->buffer + NVME_PAGE_SIZE);
	/* Address of I/O submission & completion queues */
	ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX] =
		(NVME_SQ *)(ctrlr->buffer + 2 * NVME_PAGE_SIZE);
	ctrlr->cq_buffer[NVME_IO_QUEUE_INDEX] =
		(NVME_CQ *)(ctrlr->buffer + 3 * NVME_PAGE_SIZE);
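	/* Resulting queue memory layout, one page per queue:
	 *   page 0: Admin SQ, page 1: Admin CQ, page 2: IO SQ, page 3: IO CQ */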

	DEBUG(printf("Private->Buffer = [%p]\n", (void *)virt_to_phys(ctrlr->buffer));)
	DEBUG(printf("Admin Queue Attributes = [%X]\n", aqa);)
	DEBUG(printf("Admin Submission Queue (sq_buffer[ADMIN]) = [%p]\n", (void *)virt_to_phys(ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX]));)
	DEBUG(printf("Admin Completion Queue (cq_buffer[ADMIN]) = [%p]\n", (void *)virt_to_phys(ctrlr->cq_buffer[NVME_ADMIN_QUEUE_INDEX]));)
	DEBUG(printf("I/O   Submission Queue (sq_buffer[NVME_IO_QUEUE]) = [%p]\n", (void *)virt_to_phys(ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX]));)
	DEBUG(printf("I/O   Completion Queue (cq_buffer[NVME_IO_QUEUE]) = [%p]\n", (void *)virt_to_phys(ctrlr->cq_buffer[NVME_IO_QUEUE_INDEX]));)

	/* Write AQA */
	writel(aqa, ctrlr->ctrlr_regs + NVME_AQA_OFFSET);
	/* Write ASQ */
	writell(asq, ctrlr->ctrlr_regs + NVME_ASQ_OFFSET);
	/* Write ACQ */
	writell(acq, ctrlr->ctrlr_regs + NVME_ACQ_OFFSET);

	/* Enable controller */
	status = nvme_enable_controller(ctrlr);
	if (NVME_ERROR(status))
		goto exit;

	/* Set IO queue count */
	status = nvme_set_queue_count(ctrlr, NVME_NUM_IO_QUEUES);
	if (NVME_ERROR(status))
		goto exit;

	/* Create IO queue pair */
	status = nvme_create_cq(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iocq_sz);
	if (NVME_ERROR(status))
		goto exit;

	status = nvme_create_sq(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iosq_sz);
	if (NVME_ERROR(status))
		goto exit;

	/* Identify */
	status = nvme_identify(ctrlr);
	if (NVME_ERROR(status))
		goto exit;

	/* Identify Namespace and create drive nodes */
	status = nvme_identify_namespaces(ctrlr);
	if (NVME_ERROR(status))
		goto exit;

exit:
	ctrlr->ctrlr.need_update = 0;

	return NVME_ERROR(status);
}

static int nvme_shutdown(struct CleanupFunc *cleanup, CleanupType type)
{
	NvmeCtrlr *ctrlr = (NvmeCtrlr *)cleanup->data;
	NvmeDrive *drive;
	int status = NVME_SUCCESS;

	printf("Shutting down NVMe controller.\n");

	if (NULL == ctrlr)
		return 1;

	/* Only disable controller if initialized */
	if (ctrlr->ctrlr.need_update != 1) {
		status = nvme_disable_controller(ctrlr);
		if (NVME_ERROR(status))
			return 1;
	}

	/* Save each node's successor before freeing it, since list_for_each
	 * would read the node after it has been freed. */
	ListNode *node = ctrlr->drives.next;
	while (node) {
		drive = container_of(node, NvmeDrive, list_node);
		node = node->next;
		free(drive);
	}
	free(ctrlr->controller_data);
	for (unsigned int i = 0; i < ctrlr->iosq_sz; i++)
		free(ctrlr->prp_list[i]);
	free(ctrlr->buffer);
	free(ctrlr);
	return 0;
}

/* Setup controller initialization/shutdown callbacks.
 * Used in board.c to get handle to new ctrlr.
 */
NvmeCtrlr *new_nvme_ctrlr(pcidev_t dev)
{
	NvmeCtrlr *ctrlr = xzalloc(sizeof(*ctrlr));
	static CleanupFunc cleanup = {
		&nvme_shutdown,
		CleanupOnHandoff | CleanupOnLegacy,
		NULL
	};

	/* The static CleanupFunc supports only a single controller instance */
	assert(cleanup.data == NULL);

	printf("New NVMe Controller %p @ %02x:%02x:%02x\n",
		ctrlr, PCI_BUS(dev),PCI_SLOT(dev),PCI_FUNC(dev));

	ctrlr->ctrlr.ops.update = &nvme_ctrlr_init;
	ctrlr->ctrlr.need_update = 1;
	ctrlr->dev = dev;
	cleanup.data = (void *)ctrlr;
	list_insert_after(&cleanup.list_node, &cleanup_funcs);

	return ctrlr;
}
